#!/usr/bin/env python3

"""
usage: youtubedl.py [-h] [--max-downloads MAX_DOWNLOADS]
                    [--max-filesize MAX_FILESIZE] [--ignore-livestreams]
                    [--download-dir DOWNLOAD_DIR] [--block BLOCK]
                    [--timeout TIMEOUT]
                    files

Download videos in Twitter JSON data.

positional arguments:
  files                 json files to parse

optional arguments:
  -h, --help            show this help message and exit
  --max-downloads MAX_DOWNLOADS
                        max downloads per URL
  --max-filesize MAX_FILESIZE
                        max filesize to download (bytes)
  --ignore-livestreams  ignore livestreams which may never end
  --download-dir DOWNLOAD_DIR
                        directory to download to
  --block BLOCK         hostnames to block (repeatable)
  --timeout TIMEOUT     timeout download after n seconds
"""

import os
import sys
import json
import time
import argparse
import logging
import fileinput
import youtube_dl
import multiprocessing as mp

from urllib.parse import urlparse
from datetime import datetime, timedelta
from youtube_dl.utils import match_filter_func

# Command line interface for the script. Built at import time so that main()
# only has to call parser.parse_args().
parser = argparse.ArgumentParser(description="Download videos in Twitter JSON data.")
parser.add_argument("--max-downloads", type=int, help="max downloads per URL")

parser.add_argument("--max-filesize", type=int, help="max filesize to download (bytes)")

parser.add_argument(
    "--ignore-livestreams",
    action="store_true",
    default=False,
    help="ignore livestreams which may never end",
)

parser.add_argument(
    "--download-dir", type=str, help="directory to download to", default="youtubedl"
)

# repeatable: each --block adds one hostname; stays None when never given
parser.add_argument("--block", action="append", help="hostnames to block (repeatable)")

# 0 (the default) is treated by main() as "no timeout"
parser.add_argument(
    "--timeout", type=int, default=0, help="timeout download after n seconds"
)

# One or more input files. nargs="+" still yields a list for a single file
# (as action="append" did) but also accepts several files, which is what
# fileinput.input() in main() is able to iterate over.
parser.add_argument("files", nargs="+", help="json files to parse")


def _load_seen_urls(mapping_file, log):
    """Return the set of URLs already recorded in mapping_file (empty if absent)."""
    seen = set()
    if not os.path.isfile(mapping_file):
        return seen

    with open(mapping_file) as existing:
        for line in existing:
            # only the first column is needed; taking it this way tolerates
            # blank or truncated lines instead of failing on tuple unpacking
            url = line.rstrip("\n").split("\t", 1)[0]
            if not url:
                continue
            log.info("found %s in %s", url, mapping_file)
            seen.add(url)
    return seen


def _download_with_timeout(url, ydl_opts, log, timeout):
    """Run download() for url in a child process and return the reported filename.

    Returns "" when the child put nothing on the queue or when it had to be
    terminated after ``timeout`` seconds. A falsy timeout waits indefinitely.
    """
    q = mp.Queue()
    p = mp.Process(target=download, args=(url, q, ydl_opts, log))
    p.start()

    # block until the child exits or the timeout elapses
    p.join(timeout or None)
    if p.is_alive():
        log.warning("reached timeout %s", timeout)
        p.terminate()
        p.join()
        # a queue that was in use by a terminated process may hold corrupted
        # data, so it is not read after a timeout
        return ""

    # if the queue is empty there wasn't a download
    return "" if q.empty() else q.get()


def main():
    """Download the videos linked from the tweets in the given JSON files.

    Parses the command line, creates the download directory and logs to
    ``youtubedl.log`` inside it, then reads every input line as one JSON
    tweet. Each URL in the tweet's ``entities.urls`` that has not been seen
    before and whose hostname is not blocked is passed to ``download`` in a
    separate process, limited by ``--timeout`` when that is non-zero. One
    ``url<TAB>filename`` line per processed URL is appended to
    ``mapping.tsv`` in the download directory; the filename is empty when
    nothing was downloaded or the download timed out.
    """
    args = parser.parse_args()

    # make download directory (including any missing parent directories)
    download_dir = args.download_dir
    os.makedirs(download_dir, exist_ok=True)

    # setup logger
    log_file = "{}/youtubedl.log".format(download_dir)
    logging.basicConfig(filename=log_file, level=logging.INFO)
    log = logging.getLogger()

    # setup youtube_dl config
    ydl_opts = {
        "format": "best",
        "logger": log,
        "restrictfilenames": True,
        "ignoreerrors": True,
        "nooverwrites": True,
        "writedescription": True,
        "writeinfojson": True,
        "writesubtitles": True,
        "writeautomaticsub": True,
        "outtmpl": "{}/%(extractor)s/%(id)s/%(title)s.%(ext)s".format(download_dir),
        "download_archive": "{}/archive.txt".format(download_dir),
    }
    if args.ignore_livestreams:
        ydl_opts["matchfilter"] = match_filter_func("!is_live")
    if args.max_downloads:
        ydl_opts["max_downloads"] = args.max_downloads
    if args.max_filesize:
        ydl_opts["max_filesize"] = args.max_filesize

    # hostnames to block (args.block is None when --block was never given)
    blocklist = set(args.block or ())

    # read in existing mapping file to know which urls we can ignore
    mapping_file = os.path.join(download_dir, "mapping.tsv")
    seen = _load_seen_urls(mapping_file, log)

    # loop through the tweets
    with open(mapping_file, "a") as results, fileinput.input(args.files) as lines:
        for line in lines:
            # skip blank lines rather than failing in json.loads
            if not line.strip():
                continue
            tweet = json.loads(line)
            log.info("analyzing %s", tweet["id_str"])
            for e in tweet["entities"]["urls"]:
                url = e.get("unshortened_url") or e.get("expanded_url")

                # see if we can skip this one
                if not url:
                    continue
                if url in seen:
                    log.info("already processed %s", url)
                    continue
                seen.add(url)

                # check for blocks
                if urlparse(url).netloc in blocklist:
                    log.warning("%s in block list", url)
                    continue

                # run the download in its own process so it can be timed out
                log.info("processing %s", url)
                filename = _download_with_timeout(url, ydl_opts, log, args.timeout)

                # write the result to the mapping file, flushing so the
                # record is on disk even if a later download kills the run
                results.write("{}\t{}\n".format(url, filename))
                results.flush()


def download(url, q, ydl_opts, log):
    """Download url with youtube_dl and report the resulting filename on q.

    Intended to run as the target of a child process.

    Args:
        url: the URL to hand to youtube_dl.
        q: queue on which the downloaded file's name is put; nothing is put
            when youtube_dl returned no info for the URL or the download
            limit was reached.
        ydl_opts: options dict passed to ``youtube_dl.YoutubeDL``.
        log: logger used for progress and warning messages.
    """
    filename = ""
    try:
        ydl = youtube_dl.YoutubeDL(ydl_opts)
        info = ydl.extract_info(url)
        if info:
            filename = ydl.prepare_filename(info)
            log.info("downloaded %s as %s", url, filename)
        else:
            log.warning("%s doesn't look like a video", url)
    except youtube_dl.utils.MaxDownloadsReached:
        # the limit is read from ydl_opts: the parsed command line arguments
        # are local to main() and not visible in this function
        log.warning(
            "only %s downloads per url allowed", ydl_opts.get("max_downloads")
        )

    # hand the result back to the parent, which treats an empty queue as
    # "no download"
    if filename:
        q.put(filename)


if __name__ == "__main__":
    main()
